# Load all necessary packages here
library(tidyverse)
library(readxl)
library(tigris)
library(leaflet)
library(htmltools)
library(htmlwidgets)
library(scales)
library(plotly)
library(moderndive)
library(leaflet.extras)
library(captioner)# create a vector of all the states in the West region of the U.S.
West = c("WA", "OR", "ID", "MT", "WY", "CO",
"UT", "NV", "CA", "AK", "HI")
# create a vector of all the states in the Southwest region of the U.S.
Southwest = c("AZ", "NM", "TX", "OK")
# create a vector of all the states in the Southeast region of the U.S.
Southeast = c("AR", "LA", "MS", "AL", "GA", "FL", "SC",
"TN", "NC", "KY", "VA", "WV", "DC", "DE", "MD")
# create a vector of all the states in the Northeast region of the U.S.
Northeast = c("NJ", "CT", "RI", "PA", "NY", "MA", "NH", "VT", "ME")
# create a vector of all the states in the Midwest region of the U.S.
Midwest = c("ND", "SD", "NE", "KS", "MO", "IA", "MN",
"WI", "IL", "IN", "OH", "MI")
# make a dataframe for our first dataset (poverty and household income)
# filter for year 2010
# select variables of importance
poverty_income <- read_excel("state_series_1980-2014.xls") %>%
filter(year == 2010) %>%
select(fips, state_code, median_hhinc, percent_pov, total_pop)
# make a dataframe for our second dataset (dropout rates)
# select variables of importance
# to get ready to join: rename a column
# and rename other variables we are using
dropout_rates <- read_excel("sdr091a.xls") %>%
select(FIPST, STATENAME, DRP912,
TOTDHI, TOTDBL, TOTDWH, TOTDAS, TOTDAM) %>%
rename(fips = FIPST, state_name = STATENAME, dropouts = DRP912,
total_hispanic = TOTDHI, total_black = TOTDBL, total_white = TOTDWH,
total_asian = TOTDAS, total_american = TOTDAM)
# join by FIPS state numeric code
# remove District of Columbia
# add a new variable called "region" to the dataframe based on what region the state is in
dropout_data <- inner_join(dropout_rates, poverty_income, by = "fips") %>%
filter(!(state_name == "District of Columbia")) %>%
mutate(region = case_when(state_code %in% West ~ "West",
state_code %in% Southwest ~ "Southwest",
state_code %in% Southeast ~ "Southeast",
state_code %in% Northeast ~ "Northeast",
state_code %in% Midwest ~ "Midwest"))In order to do this comparison and analysis, we used the state dropout data1 from the Common Core of Data (CCD)2 surveys that are submitted every year to the National Center for Education Statistics (NCES). Additionally, we used demographic and economic data3. This data came from the U.S. Bureau of Labor Statistics Local Area Statistics Project, U.S. Census Bureau Small Area Income and Poverty Estimates, and U.S. Census Bureau Population and Housing Estimates. We chose to focus on state, region, race, median household income, poverty level, and total resident population (all people who are usually residents of a specific state4).
Our barplot allows for analysis of dropouts by race, our interactive map provides a closer view on each state’s statistics, and our simple and multiple regression models give an insight on how poverty levels and income influence dropout rates. Check this footnote5 to explore the interactive features for each figure.
We created a stacked barplot6 and chose one common way to divide the United States into five regions7.
# group by region
# count: number of Hispanic drop outs, number of Black drop outs,
# number of White drop outs, number of Asian/Hawaiian Native/Pacific Islander drop outs,
# number of American Indian/Alaska Native drop outs, total population
barplotData <- dropout_data %>%
group_by(region) %>%
summarize(sum_hispanic = sum(total_hispanic), sum_black = sum(total_black),
sum_white = sum(total_white), sum_asian = sum(total_asian),
sum_american = sum(total_american), sum_pop = sum(total_pop))
# make stacked barplot (a stack for each race)
plot_ly(data = barplotData, x = ~region, y = ~sum_hispanic,
type = 'bar', name = 'Hispanic',
text = paste("Total Region Population:", comma(barplotData$sum_pop)),
marker = list(color = 'rgb(0,0,128)')) %>%
add_trace(y = ~sum_black, name = 'Black',
marker = list(color = 'rgb(30,144,255)')) %>%
add_trace(y = ~sum_white, name = 'White',
marker = list(color = 'rgb(135,206,250)')) %>%
add_trace(y = ~sum_asian, name = 'Asian/Hawaiian Native/Pacific Islander',
marker = list(color = 'rgb(0,191,255)')) %>%
add_trace(y = ~sum_american, name = 'American Indian/Alaska Native',
marker = list(color = 'rgb(20, 106, 162)')) %>%
layout(title ="Total High School Dropouts by Race in the 2009-2010 School Year",
yaxis = list(title = 'Number of Dropouts', tickformat = ",d"),
xaxis = list(title = 'U.S. Region', categoryorder = "array",
categoryarray = c("West", "Southeast", "Midwest",
"Southwest", "Northeast")),
barmode = 'stack',
legend = list(x = 100, y = 0.5),
annotations = list(yref = 'paper', xref = 'paper', y = 0.65, x = 1.13,
text = "Race", showarrow = F)) %>%
config(displayModeBar = FALSE)The descending order of the barplot8 allows us to easily see that the West has the largest number of dropouts and the Northeast has the fewest. This could be due to the fact that the West has a larger population than the Northeast. However, this reasoning doesn’t explain why the West has more dropouts than the Southeast or Midwest since both of these regions have larger populations than the West.
Among all races, Hispanics have the highest number of dropouts in the West and Southwest, while Whites have the highest number in the Southeast, Midwest, and Northeast; again, this could be because of the concentration of these races in specific regions. Overall, if we consider the total population of dropouts across states, we see that American Indians (12,004) and Asians (13,794) have the lowest number of dropout across races and regions, and that Whites (191,916) and Hispanics (149,990) have the highest.
Our map9 is colored by 2009-2010 dropout rates.
# load spatial data
states <- states()
# inner join spatial data and a dataframe
states_merged <- geo_join(states, dropout_data, "STUSPS", "state_code", how = "inner")
# make blue color palette based on the range of dropout rate numbers
pal_dropouts <- colorNumeric("Blues", domain=states_merged$dropouts)
# make popup labels
popup_label <- paste0("<strong>", states_merged$NAME,
"</strong><br />Total Population: ",
comma(states_merged$total_pop),
"<br />Dropout Rate: ",
paste(format(round(states_merged$dropouts, 2), nsmall = 2), "%",
sep = ""),
"<br />Percent in Poverty: ",
paste(states_merged$percent_pov, "%", sep = ""),
"<br />Median Household Income: ",
comma(states_merged$median_hhinc))# make interactive map
# at start: center the map on the U.S.
# add icon to reset map to zoom level 4, centered on U.S.
leaflet(states) %>%
addProviderTiles("CartoDB.Positron") %>%
setView(-98.483330, 38.712046, zoom = 4) %>%
addPolygons(data = states_merged,
fillColor = ~pal_dropouts(states_merged$dropouts),
fillOpacity = 0.7,
weight = 0.2,
smoothFactor = 0.2,
highlight = highlightOptions(weight = 5, color = "#666",
fillOpacity = 0.7, bringToFront = TRUE),
popup = ~popup_label,
label = states_merged$NAME) %>%
addLegend(pal = pal_dropouts,
values = states_merged$dropouts,
position = "bottomright",
title = "Dropout Rate",
labFormat = labelFormat(suffix = "%")) %>%
addResetMapButton()